rm(list=ls(all=TRUE))
stages_frog_match <- c("unfertilised","1-cell","16-cell","110-cell","late neurula","mid-tailbud","late-tailbud","larva")
stages_frog_merged = c("oo12","oo34","oo56","egg","st02","st05","st08","st09","st10","st11","st12","st13","st15","st17","st19","st20","st21","st23","st25","st26","st28","st30","st31","st35","st37-38","st40","st43","st48","st61","st66")
stages_frog_order_all = c("oo12","oo34","oo56","egg","egg","st02","st02","st05","st05","st08","st08","st08","st09","st09","st09","st09","st10","st10","st10","st11","st11","st12","st12","st13","st13","st15","st15","st17","st17","st19","st19","st20","st20","st21","st21","st23","st23","st25","st25","st26","st26","st28","st28","st30","st30","st31","st31","st35","st35","st35","st37-38","st37-38","st40","st40","st43","st43","st48","st48","st61","st61","st66","st66")
theme_personalised <- theme_pubr(base_family = "Avenir") +
theme(panel.background=element_blank(),
panel.border=element_blank(),
panel.grid.major=element_blank(),
panel.grid.minor=element_blank(),
plot.background=element_blank())
heatmap_function <- function(matrix,title, legend) {
# Calculate min & max
min_value <- min(matrix)
max_value<- max(matrix)
middle_value <- (min_value + max_value) / 2
# color scale
color_scale <- colorRamp2(c(min_value, middle_value, max_value), c(
"#A7D3D4FF", "#F1F1F1FF", "#E4C1D9FF"
#"#DC876CFF", "#FEFEFEFF", "#ADD0B5FF"
))
heatmap <- Heatmap(
matrix,
name = "Correlation",
col = color_scale,
show_row_names = TRUE,
show_column_names = TRUE,
cluster_rows = FALSE,
cluster_columns = FALSE,
column_dend_height = 50,
column_title = title,
row_title = "",
row_names_side = c("left"),
column_title_gp = gpar(fontsize = 16),
row_title_gp = gpar(fontsize = 16),
column_names_rot = 45,
row_names_rot = 0,
cell_fun = function(j, i, x, y, width, height, fill) {
grid.text(x = x, y = y, label = round(matrix[i, j], 2), gp = gpar(fontsize = 10))
},
heatmap_legend_param = list(title = paste(legend," correlation"), direction = "horizontal"))
return(heatmap)
# draw(heatmap,heatmap_legend_side = "bottom", annotation_legend_side = "bottom")
}
mean_by_repStage.fn <-
function(dataset, column_order) {
result <- dataset %>%
pivot_longer(-gene_id, names_to = "stage") %>%
separate(stage, into = c("stage", "rep"), sep = "_") %>%
group_by(gene_id, stage) %>%
summarise(mean = mean(value)) %>% # Calculate the row means for each stage by gene
pivot_wider(names_from = stage, values_from = mean) %>% # Pivot the result to wide format
dplyr::select(gene_id, all_of(column_order)) %>%
ungroup()
return(result)
}07_Xenopus_RNAseq
1 Overview Xenopus Laevis RNA-seq time-series
We downloaded and analysed the data from ‘Constrained vertebrate evolution by pleiotropic genes’ and ‘Genome evolution in the allotetraploid frog Xenopus laevis’. When combining the two time series, we have a total of 32 distinct time points that perfectly overlap with developmental stages present in our TMTcPro proteome. The processing pipeline is the same employed in Fig2.3_frog_RNAseq and Fig4.2_frog_RNAseq_merged_with_Expande
1.0.0.1 Loading libraries
1.0.0.2 Functions and misc
1.1 Preparing tx2gene for Xenopus
Two different times series were download.
Downlaod the XENLA_10.1_GCF_XBmodels.gff3 annotation which combine Xenabase and NCBI-Xenbase gene models. The 3 column has multiple “non-standard” entries, we grep all of them to prepare the transcript to gene annotation files: guide_RNA,lnc_RNA,,mRNA,rRNA,scRNA,snRNA,snoRNA,tRNA,telomerase_RNA,C_gene_segment,D_loop,V_gene_segment,origin_of_replication,pseudogene,transcript
###
# gff3 view
# ID=XBmRNA50369;Parent=XBXL10_1g26894;gene=LOC108718813
###
# selecting for only mRNA does not work with this annotation => losing genes with salmon gene/trasncript quantification
# cat XENLA_10.1_GCF_XBmodels.gff3 | awk -F "\t" 'BEGIN{OFS="\t"}{if($3=="mRNA"){split($9, a, ";"); gsub(/^ID=/, "", a[1]); gsub(/^Parent=/, "", a[2]); gsub(/^gene=|^exception=/, "", a[6]); print a[1],a[2],a[6]}}' > mRNA2gene_XenLa10.1_GCFXBmodels.txt
cat XENLA_10.1_GCF_XBmodels.gff3 | \
awk -F "\t" 'BEGIN{OFS="\t"}{if($3=="guide_RNA" || $3=="lnc_RNA" || $3=="mRNA" || $3=="rRNA" || $3=="scRNA" || $3=="snRNA" || $3=="snoRNA" || $3=="tRNA" || $3=="telomerase_RNA" || $3=="C_gene_segment" || $3=="D_loop" || $3=="V_gene_segment" || $3=="origin_of_replication" || $3=="pseudogene" || $3=="transcript"){split($9, a, ";"); \gsub(/^ID=/, "", a[1]); gsub(/^Parent=/, "", a[2]); gsub(/^gene=|^exception=/, "", a[6]); print a[1],a[2],a[6]}}' > tx2gene_XenLa10.1_GCFXBmodels.txt
# converting gff3 to GTF, re do the annotation step and comapre the results
agat_convert_sp_gff2gtf.pl -gff XENLA_10.1_GCF_XBmodels.gff3 -o XENLA_10.1_GCF_XBmodels_agat.gtf
cat XENLA_10.1_GCF_XBmodels_agat.gtf |
awk -F "\t" 'BEGIN{OFS="\t"}{if($3=="transcript"){split($9, a, ";"); print a[1],a[4],a[2]}}' |
sed 's/gene_id "//g;s/"\t ID "/\t/g;s/"\t transcript_id "/\t/g;s/"//' > tx2gene_XenLa10.1_GCFXBmodels_agat.txt
### XENLA_10.1_GCF_XBmodels.gtf | CDS ==> NCBI-Xenbase gene models (gtf) => not good annotation
# if($3=="exon")
# if($3=="CDS")
# cat XENLA_10.1_GCF_XBmodels.gtf |
# awk -F "\t" 'BEGIN{OFS="\t"}{if($3=="CDS"){split($9, a, ";"); print a[1],a[2],a[3],a[4]}}' |
# sed 's/gene_id "//g;s/"\t gene_name "/\t/g;s/"\t transcript_id "/\t/g;s/"\t transcript_name "//g;s/"//' > XENLA_10.1_GCF_XBmodels.gtf_cds_t2tgene.txtYou can either use tximeta or tximport packages to import Salmon transcript quantification data and summarize it at the gene level.
1.2 Loading quantification data
###############################
#### Salmon quantification files
###############################
# point to Salmon quantification file
dir_frog <- "/Users/mariossi/Library/CloudStorage/Dropbox-Princeton/Papers/Proteomics_ciona/b.raw_data/transcriptome/xenopus_laevis/"
gff <- "salmon_gffread_gff/"
# gtf <- "salmon_gffread_gtf/"
# List all directories containing data
names <- list.files(dir_frog)
###############################
#### Time series genome evolution
###############################
# read in your study design/sampleTable
samples_ts_genevo <- read_excel(
path = "/Users/mariossi/Library/CloudStorage/Dropbox-Princeton/Papers/Proteomics_ciona/a.reference_annotation/RNAseq_metadata_frog.xlsx",
col_names = T,
trim_ws = T,
sheet = "time_series_1_genevo",
range = cell_cols("A:G")) |>
clean_names()
samples_ts_genevo <- samples_ts_genevo[1:27,] |> as_tibble()
## Obtain a vector of all filenames including the path
files_genevo <- file.path(dir_frog, paste0(gff,samples_ts_genevo$library_name,"/quant.sf"))
names(files_genevo) <- samples_ts_genevo$dev_stage_name # Rename the files using the new names
print(names(files_genevo)) # Print the new file names [1] "oo12_1" "oo34_1" "oo56_1" "egg_1" "egg_2" "st08_1" "st08_2" "st08_3" "st09_1" "st09_2" "st10_1" "st10_2" "st10_3" "st12_1" "st12_2" "st15_1" "st15_2" "st20_1" "st20_2" "st25_1" "st25_2" "st30_1" "st30_2" "st35_1" "st35_2" "st35_3" "st40_1"
all(file.exists(files_genevo))[1] TRUE
###############################
#### Time series EXPANDE
###############################
# read in your study design/sampleTable
samples_ts_expande <- read_excel(
path = "/Users/mariossi/Library/CloudStorage/Dropbox-Princeton/Papers/Proteomics_ciona/a.reference_annotation/RNAseq_metadata_frog.xlsx",
col_names = T,
trim_ws = T,
sheet = "time_series_2_expande",
range = cell_cols("A:H")) |>
clean_names()
## Obtain a vector of all filenames including the path
files_expande <- file.path(dir_frog, paste0(gff,samples_ts_expande$library_name,"/quant.sf"))
samples_expande <- samples_ts_expande$dev_stage_name # Rename the files using the new names
names(files_expande) <- samples_expande
print(names(files_expande)) # Print the new file names [1] "st02_1" "st02_2" "st05_1" "st05_2" "st09_1" "st09_2" "st11_1" "st11_2" "st13_1" "st13_2" "st17_1" "st17_2" "st19_1" "st19_2" "st21_1" "st21_2" "st23_1" "st23_2" "st26_1" "st26_2" "st28_1" "st28_2" "st31_1" "st31_2" "st37-38_1" "st37-38_2" "st43_1" "st43_2" "st48_1" "st48_2" "st61_1" "st61_2" "st66_1" "st66_2"
all(file.exists(files_expande))[1] TRUE
###############################
#### Time series merged
###############################
# read in your study design/sampleTable
samples_ts_merged <- read_excel(
path = "/Users/mariossi/Library/CloudStorage/Dropbox-Princeton/Papers/Proteomics_ciona/a.reference_annotation/RNAseq_metadata_frog.xlsx",
col_names = T,
trim_ws = T,
sheet = "time_series_merged",
range = cell_cols("A:J")) |>
clean_names()
samples_ts_merged <- samples_ts_merged[1:61,] |> as_tibble()
## Obtain a vector of all filenames including the path
files_merged <- file.path(dir_frog, paste0(gff,samples_ts_merged$library_name,"/quant.sf"))
samples_merge_name <- samples_ts_merged$dev_stage_name # Rename the files using the new names
names(files_merged) <- samples_merge_name
print(names(files_merged)) # Print the new file names [1] "oo12_1" "oo34_1" "oo56_1" "egg_1" "egg_2" "st02_1" "st02_2" "st05_1" "st05_2" "st08_1" "st08_2" "st08_3" "st09_1" "st09_2" "st09_3" "st09_4" "st10_1" "st10_2" "st10_3" "st11_1" "st11_2" "st12_1" "st12_2" "st13_1" "st13_2" "st15_1" "st15_2" "st17_1" "st17_2" "st19_1" "st19_2" "st20_1" "st20_2" "st21_1" "st21_2" "st23_1" "st23_2" "st25_1" "st25_2" "st26_1" "st26_2" "st28_1" "st28_2" "st30_1" "st30_2" "st31_1" "st31_2" "st35_1" "st35_2" "st35_3" "st37-38_1" "st37-38_2" "st40_1" "st43_1" "st43_2" "st48_1" "st48_2" "st61_1" "st61_2" "st66_1" "st66_2"
all(file.exists(files_merged))[1] TRUE
###############################
#### Coldata
###############################
# create metadata df
coldata_ts_gene_evo <- data.frame(files=files_genevo, samples_ts_genevo, stringsAsFactors=FALSE) %>% clean_names()
coldata_ts_expande <- data.frame(files=files_expande, samples_ts_expande, stringsAsFactors=FALSE) %>% clean_names()
coldata_ts_merged <- data.frame(files=files_merged, samples_ts_merged, stringsAsFactors=FALSE) %>% clean_names()1.3 tximport
Imports transcript-level abundance, estimated counts and transcript lengths, and summarizes into matrices for use with downstream gene-level analysis packages. Soneson C, Love MI, Robinson MD (2015). “Differential analyses for RNA-seq: transcript-level estimates improve gene-level inferences.” F1000Research, 4. doi: 10.12688/f1000research.7563.1. We create annotations using makeTxDbFromGFF and match genes-transcripts IDs
###############################
#### tximport
###############################
tx2gene_gff_frog <- read_delim("/Users/mariossi/Library/CloudStorage/Dropbox-Princeton/Papers/Proteomics_ciona/a.reference_annotation/frog_XenLa10.1/XENLA_10.1_GCF_XBmodels_tx2tgene.txt")
txi_ts1 <- tximport(files_genevo,
type = "salmon",
tx2gene = tx2gene_gff_frog,
ignoreTxVersion = T,
countsFromAbundance = "lengthScaledTPM")
txi_ts2 <- tximport(files_expande,
type = "salmon",
tx2gene = tx2gene_gff_frog, #tx2gene[,c("tx_id", "ensgene")]
ignoreTxVersion = T,
countsFromAbundance = "lengthScaledTPM")
txi_both <- tximport(files_merged,
type = "salmon",
tx2gene = tx2gene_gff_frog, #tx2gene[,c("tx_id", "ensgene")]
ignoreTxVersion = T,
countsFromAbundance = "lengthScaledTPM")
# if you exported transcript level data and want to append your gene symbols to the data frame
# Txi_trans <- as_tibble(Txi_trans$counts, rownames = "target_id")
# Txi_trans <- left_join(Txi_trans, Tx)
# txi has the matrices of the abundance, counts and length
names(txi_both)[1] "abundance" "counts" "length" "countsFromAbundance"
### Check that sample names match in both files
all(colnames(txi_both$counts) %in% coldata_ts_merged$dev_stage_name)[1] TRUE
1.3.1 TPM ts1
###############################
#### TPM
###############################
TPM_txi_ts1.df <-
as_tibble(txi_ts1$abundance, rownames = NA) %>%
rownames_to_column("gene_id")
# save normalised and gene-level estimated TPM
write.table(TPM_txi_ts1.df, file = "frog_ts1_rna_tpm.tsv", sep = "\t", quote = F,row.names=FALSE, col.names = T)
# 42,042 to 30008
keepTPM <- rowSums(TPM_txi_ts1.df[,-1]) >= 2 # 2 TPM per gene
table(keepTPM)keepTPM
FALSE TRUE
12074 29968
TPM_txi_ts1.df <- TPM_txi_ts1.df[keepTPM,]
write.table(TPM_txi_ts1.df, file="frog_ts1_rna_tpmFilt.tsv", sep="\t", quote=F, row.names = F,col.names=T)
###############################
#### DESeq2
###############################
stages_ts1 <- c("oo12", "oo34", "oo56", "egg", "st08", "st09", "st10", "st12", "st15", "st20", "st25", "st30", "st35", "st40")
coldata_ts_gene_evo$condition <- coldata_ts_gene_evo$short_stage
group <- factor(coldata_ts_gene_evo$condition,levels = stages_ts1)
coldata_ts_gene_evo$condition <- group
dds_ts1 <- DESeqDataSetFromTximport(txi_ts1,
colData = coldata_ts_gene_evo,
design = ~ condition)
dds_ts1 <- estimateSizeFactors(dds_ts1)
sizeFactors(dds_ts1) oo12_1 oo34_1 oo56_1 egg_1 egg_2 st08_1 st08_2 st08_3 st09_1 st09_2 st10_1 st10_2 st10_3 st12_1 st12_2 st15_1 st15_2 st20_1 st20_2 st25_1 st25_2 st30_1 st30_2 st35_1 st35_2 st35_3 st40_1
2.03767395 1.56042078 2.12303127 0.48975437 0.62495824 1.55443948 1.21326408 6.54368122 1.59223618 1.68654819 1.13433200 0.94384874 3.90257506 0.71419557 0.50706537 0.40672291 0.53228633 0.88621026 0.65633804 1.28366672 0.65729847 1.46426451 1.00628501 1.14867224 0.03189453 2.71637337 1.15856811
DESeq_ts1_frog<- counts(dds_ts1, normalized=TRUE) %>% as_tibble(., rownames ="gene_id") # normalised counts
# save normalised and gene-level estimated DESeq2
write.table(DESeq_ts1_frog, file="frog_ts1_DESeq2.tsv", sep="\t", quote=F, row.names = F, col.names = T)
# Filtering low abundance genes. We keep genes that are expressed in at least 2 samples with >= 10
keep_ts1 <- rowSums(assay(dds_ts1) >= 10) >= 2
dds_ts1 <- dds_ts1[keep_ts1,] #16246
dds_ts1 <- estimateSizeFactors(dds_ts1)
sizeFactors(dds_ts1) oo12_1 oo34_1 oo56_1 egg_1 egg_2 st08_1 st08_2 st08_3 st09_1 st09_2 st10_1 st10_2 st10_3 st12_1 st12_2 st15_1 st15_2 st20_1 st20_2 st25_1 st25_2 st30_1 st30_2 st35_1 st35_2 st35_3 st40_1
2.03767395 1.56042078 2.12303127 0.48975437 0.62495824 1.55443948 1.21326408 6.54368122 1.59223618 1.68654819 1.13433200 0.94384874 3.90257506 0.71419557 0.50706537 0.40672291 0.53228633 0.88621026 0.65633804 1.28366672 0.65729847 1.46426451 1.00628501 1.14867224 0.03189453 2.71637337 1.15856811
### Exporting normalized counts
normCounts_ts1 <- counts(dds_ts1, normalized=TRUE)
write.table(normCounts_ts1, file="normalisedDeseq_RNA_frog_ts1.tsv", sep="\t", quote=F, col.names=NA)1.3.2 TPM ts2
###############################
#### TPM
###############################
TPM_txi_ts2.df <-
as_tibble(txi_ts2$abundance, rownames = NA) %>%
rownames_to_column("gene_id")
# save normalised and gene-level estimated TPM
write.table(TPM_txi_ts2.df, file = "frog_ts2_rna_tpm.tsv", sep = "\t", quote = F,row.names=FALSE, col.names = T)
# 42,042 to 30008
keepTPM <- rowSums(TPM_txi_ts2.df[,-1]) >= 2 # 2 TPM per gene
table(keepTPM)keepTPM
FALSE TRUE
6345 35697
TPM_txi_ts2.df <- TPM_txi_ts2.df[keepTPM,]
write.table(TPM_txi_ts2.df, file="frog_ts2_rna_tpmFilt.tsv", sep="\t", quote=F, row.names = F,col.names=T)
###############################
#### DESeq2
###############################
unique_elements <- unique(coldata_ts_expande$short_stage)
unique_elements [1] "st02" "st05" "st09" "st11" "st13" "st17" "st19" "st21" "st23" "st26" "st28" "st31" "st37-38" "st43" "st48" "st61" "st66"
coldata_ts_expande$condition <- coldata_ts_expande$short_stage
group <- factor(coldata_ts_expande$condition,levels = unique_elements)
coldata_ts_expande$condition <- group
dds_ts2 <- DESeqDataSetFromTximport(txi_ts2,
colData = coldata_ts_expande,
design = ~ condition)
dds_ts2 <- estimateSizeFactors(dds_ts2)
sizeFactors(dds_ts2) st02_1 st02_2 st05_1 st05_2 st09_1 st09_2 st11_1 st11_2 st13_1 st13_2 st17_1 st17_2 st19_1 st19_2 st21_1 st21_2 st23_1 st23_2 st26_1 st26_2 st28_1 st28_2 st31_1 st31_2 st37-38_1 st37-38_2 st43_1 st43_2 st48_1 st48_2 st61_1 st61_2 st66_1 st66_2
0.7160142 0.6754361 0.8210946 1.0224680 1.2369617 1.2106088 0.9183008 0.8504560 0.8245654 0.8105487 1.1309582 1.0026527 1.1764919 1.1658180 0.8850037 0.9281650 0.9121465 0.8144108 1.3614312 1.5113700 1.3258554 1.6577136 1.5383287 1.2375204 1.2304822 1.3108729 0.8843944 1.2450906 1.0969132 0.8180797 1.1405791 0.9191203 0.8519913 0.6812380
DESeq_ts2.df <- counts(dds_ts2, normalized=TRUE) %>% as_tibble(., rownames ="gene_id") # normalised counts
# save normalised and gene-level estimated DESeq2
write.table(DESeq_ts2.df, file="frog_ts2_DESeq2.tsv", sep="\t", quote=F, row.names = F, col.names = T)
# Filtering low abundance genes. We keep genes that are expressed in at least 2 samples with >= 10
keep_ts2 <- rowSums(assay(dds_ts2) >= 10) >= 2
dds_ts2 <- dds_ts2[keep_ts2,] #16246
dds_ts2 <- estimateSizeFactors(dds_ts2)
sizeFactors(dds_ts2) st02_1 st02_2 st05_1 st05_2 st09_1 st09_2 st11_1 st11_2 st13_1 st13_2 st17_1 st17_2 st19_1 st19_2 st21_1 st21_2 st23_1 st23_2 st26_1 st26_2 st28_1 st28_2 st31_1 st31_2 st37-38_1 st37-38_2 st43_1 st43_2 st48_1 st48_2 st61_1 st61_2 st66_1 st66_2
0.7159855 0.6753248 0.8210875 1.0221377 1.2369361 1.2109413 0.9182624 0.8505945 0.8245141 0.8105327 1.1310381 1.0028078 1.1766958 1.1658633 0.8850037 0.9281650 0.9121465 0.8143423 1.3614372 1.5113700 1.3257545 1.6577136 1.5383612 1.2375204 1.2304822 1.3111006 0.8843301 1.2450498 1.0963127 0.8180797 1.1403499 0.9189030 0.8518917 0.6811925
### Exporting normalized counts
normCounts_dds_ts2<- counts(dds_ts2, normalized=TRUE)
write.table(normCounts_dds_ts2, file="normalisedDeseq_RNA_frog_ts2.tsv", sep="\t", quote=F, col.names=NA)1.3.3 TPM ts3
###############################
#### TPM
###############################
TPM_txi_both.df <-
as_tibble(txi_both$abundance, rownames = NA) %>%
rownames_to_column("gene_id")
# save normalised and gene-level estimated TPM
write.table(TPM_txi_both.df, file = "frog_both_rna_tpm.tsv", sep = "\t", quote = F,row.names=FALSE, col.names = T)
# 42,042 to 30008
keepTPM <- rowSums(TPM_txi_both.df[,-1]) >= 2 # 2 TPM per gene
table(keepTPM)keepTPM
FALSE TRUE
5479 36563
TPM_txi_both.df <- TPM_txi_both.df[keepTPM,]
write.table(TPM_txi_both.df, file="frog_both_rna_tpmFilt.tsv", sep="\t", quote=F, row.names = F,col.names=T)
###############################
#### DESeq2
###############################
uniqueBoth = unique(coldata_ts_merged$short_stage)
coldata_ts_merged$condition <- coldata_ts_merged$short_stage
group <- factor(coldata_ts_merged$condition,levels = uniqueBoth)
coldata_ts_merged$condition <- group
dds_both <- DESeqDataSetFromTximport(txi_both,
colData = coldata_ts_merged,
design = ~ time_series + condition)
dds_both <- estimateSizeFactors(dds_both)
sizeFactors(dds_both) oo12_1 oo34_1 oo56_1 egg_1 egg_2 st02_1 st02_2 st05_1 st05_2 st08_1 st08_2 st08_3 st09_1 st09_2 st09_3 st09_4 st10_1 st10_2 st10_3 st11_1 st11_2 st12_1 st12_2 st13_1 st13_2 st15_1 st15_2 st17_1 st17_2 st19_1 st19_2 st20_1 st20_2 st21_1 st21_2 st23_1 st23_2 st25_1 st25_2 st26_1 st26_2 st28_1 st28_2 st30_1 st30_2 st31_1 st31_2 st35_1 st35_2 st35_3 st37-38_1 st37-38_2 st40_1 st43_1 st43_2 st48_1 st48_2 st61_1 st61_2 st66_1 st66_2
1.55983814 1.25744306 1.65575739 0.40425895 0.47753355 0.93642574 0.88024098 1.04622495 1.30566213 1.31951216 0.99987136 5.32693762 1.33494565 1.39923222 1.53784059 1.49649321 0.94601462 0.76096445 3.11013853 1.14981243 1.06919617 0.58307748 0.39726936 1.03801594 1.02326132 0.32992465 0.41911703 1.40240895 1.24551166 1.44692524 1.42277249 0.71428681 0.50505884 1.07214252 1.13330232 1.10712159 0.99112590 1.04252031 0.50845900 1.63045030 1.82532445 1.59874232 2.01016623 1.19181502 0.78262200 1.85067916 1.49762976 0.94683628 0.02519423 2.13975449 1.44914416 1.55215301 0.94978848 1.01689825 1.42951258 1.19252635 0.88846275 1.24673010 1.00620109 0.92157448 0.73449299
DESeq_both.df <- counts(dds_both, normalized=TRUE) %>% as_tibble(., rownames ="gene_id") # normalised counts
# save normalised and gene-level estimated DESeq2
write.table(DESeq_both.df, file="frog_both_DESeq2.tsv", sep="\t", quote=F, row.names = F, col.names = T)
# Filtering low abundance genes. We keep genes that are expressed in at least 2 samples with >= 10
keep_both <- rowSums(assay(dds_both) >= 10) >= 2
dds_both <- dds_both[keep_both,] #16246
dds_both <- estimateSizeFactors(dds_both)
sizeFactors(dds_both) oo12_1 oo34_1 oo56_1 egg_1 egg_2 st02_1 st02_2 st05_1 st05_2 st08_1 st08_2 st08_3 st09_1 st09_2 st09_3 st09_4 st10_1 st10_2 st10_3 st11_1 st11_2 st12_1 st12_2 st13_1 st13_2 st15_1 st15_2 st17_1 st17_2 st19_1 st19_2 st20_1 st20_2 st21_1 st21_2 st23_1 st23_2 st25_1 st25_2 st26_1 st26_2 st28_1 st28_2 st30_1 st30_2 st31_1 st31_2 st35_1 st35_2 st35_3 st37-38_1 st37-38_2 st40_1 st43_1 st43_2 st48_1 st48_2 st61_1 st61_2 st66_1 st66_2
1.55983814 1.25744306 1.65575739 0.40425895 0.47753355 0.93642574 0.88024098 1.04622495 1.30566213 1.31951216 0.99987136 5.32693762 1.33494565 1.39923222 1.53784059 1.49649321 0.94601462 0.76096445 3.11013853 1.14981243 1.06919617 0.58307748 0.39726936 1.03801594 1.02326132 0.32992465 0.41911703 1.40240895 1.24551166 1.44692524 1.42277249 0.71428681 0.50505884 1.07214252 1.13330232 1.10712159 0.99112590 1.04252031 0.50845900 1.63045030 1.82532445 1.59874232 2.01016623 1.19181502 0.78262200 1.85067916 1.49762976 0.94683628 0.02519423 2.13975449 1.44914416 1.55215301 0.94978848 1.01689825 1.42951258 1.19252635 0.88846275 1.24673010 1.00620109 0.92157448 0.73449299
### Exporting normalized counts
normCounts_both <- counts(dds_both, normalized=TRUE)
write.table(normCounts_both, file="normalisedDeseq_RNA_frog_both.tsv", sep="\t", quote=F, col.names=NA)1.3.4 PCA
###############################
#### Transform counts for data visualization
###############################
# DESeq2: Variance Stabilising Transformation - log transformed values
rld_ts1 <- vst(dds_ts1, blind=TRUE) #rlog
rld_ts2 <- vst(dds_ts2, blind=TRUE) #rlog
rld_both <- vst(dds_both, blind=TRUE) #rlog
###############################
#### PCA ts1
###############################
rldTs1_mat <- assay(rld_ts1)
pca.res_ts1 <- prcomp(t(rldTs1_mat),center = T)
pc.var_ts1<-pca.res_ts1$sdev^2 # sdev^2 captures these eigenvalues from the PCA result
pc.per_ts1<-round(pc.var_ts1/sum(pc.var_ts1)*100, 1)
pca.res_ts1.df <- as_tibble(pca.res_ts1$x) |>
mutate(group= coldata_ts_gene_evo |> pull(condition)) |>
dplyr::select(group, everything())
coldata_ts_gene_evo <- coldata_ts_gene_evo |> mutate(dev_stage_name = paste(dev_stage_name, replicate, sep = "_"))
labels =coldata_ts_gene_evo %>% pull(dev_stage_name)
color=coldata_ts_gene_evo %>% pull(condition)
pca.plot_ts1 <- ggplot(pca.res_ts1) +
aes(x=PC1, y=PC2,
label= labels,
color = color) +
geom_point(size=4) +
scale_color_viridis_d(option = "C") +
# scale_color_manual(values = ciona_stages_palette) +
xlab(paste0("PC1 (",pc.per_ts1[1],"%",")")) +
ylab(paste0("PC2 (",pc.per_ts1[2],"%",")")) +
labs(color = "Users By labs")+
coord_fixed() +
labs(color = "hpf") +
theme_personalised +
theme(legend.position = c(0.5,0.1),
legend.direction = "horizontal")
pca.plot_ts1 ggplotly(pca.plot_ts1) # drop st40 -- it's an eggcairo_pdf("frog_rna_ts1_pca.pdf", width=8.5, height=5.6)
print(pca.plot_ts1)
invisible(dev.off())
###############################
#### PCA ts2
###############################
rldts2_mat <- assay(rld_ts2)
pca.res_ts2 <- prcomp(t(rldts2_mat),center = T)
pc.var_ts2<-pca.res_ts2$sdev^2 # sdev^2 captures these eigenvalues from the PCA result
pc.per_ts2<-round(pc.var_ts2/sum(pc.var_ts2)*100, 1)
pca.res_ts2.df <- as_tibble(pca.res_ts2$x) |>
mutate(group= coldata_ts_expande |> pull(condition)) |>
dplyr::select(group, everything())
coldata_ts_expande <- coldata_ts_expande |> mutate(dev_stage_name = paste(dev_stage_name, replicate, sep = "_"))
labels =coldata_ts_expande %>% pull(dev_stage_name)
color=coldata_ts_expande %>% pull(condition) #time
pca.plot_ts2 <- ggplot(pca.res_ts2) +
aes(x=PC1, y=PC2,
label= labels,
color = color) +
geom_point(size=4) +
scale_color_viridis_d(option = "C") +
# scale_color_manual(values = ciona_stages_palette) +
xlab(paste0("PC1 (",pc.per_ts2[1],"%",")")) +
ylab(paste0("PC2 (",pc.per_ts2[2],"%",")")) +
labs(color = "Users By labs")+
coord_fixed() +
labs(color = "hpf") +
theme_personalised +
theme(legend.position = c(0.5,0.1),
legend.direction = "horizontal")
pca.plot_ts2 ggplotly(pca.plot_ts2) # drop st40 -- it's an eggcairo_pdf("frog_rna_ts2_pca.pdf", width=8.5, height=5.6)
print(pca.plot_ts2)
invisible(dev.off())
###############################
#### PCA both
###############################
rldboth_mat <- assay(rld_both) #[, !grepl("^adult", colnames(assay(rld)))]
pca.res_both <- prcomp(t(rldboth_mat),center = T)
pc.var_both<-pca.res_both$sdev^2 # sdev^2 captures these eigenvalues from the PCA result
pc.per_both<-round(pc.var_both/sum(pc.var_both)*100, 1)
pca.res_both.df <- as_tibble(pca.res_both$x) |>
mutate(group= coldata_ts_merged |> pull(condition)) |>
dplyr::select(group, everything())
coldata_ts_merged <- coldata_ts_merged |> mutate(dev_stage_name = paste(dev_stage_name, replicate, sep = "_"))
labels =coldata_ts_merged %>% pull(dev_stage_name)
color=coldata_ts_merged %>% pull(condition) #time
pca.plot_both <- ggplot(pca.res_both) +
aes(x=PC1, y=PC2,
label= labels,
color = color) +
geom_point(size=4) +
scale_color_viridis_d(option = "C") +
# scale_color_manual(values = ciona_stages_palette) +
xlab(paste0("PC1 (",pc.per_both[1],"%",")")) +
ylab(paste0("PC2 (",pc.per_both[2],"%",")")) +
labs(color = "Users By labs")+
coord_fixed() +
labs(color = "hpf") +
theme_personalised +
theme(legend.position = c(0.5,0.1),
legend.direction = "horizontal")
pca.plot_bothggplotly(pca.plot_both) color=coldata_ts_merged %>% pull(time_series)
pca.plot_bothPaper <- ggplot(pca.res_both) +
aes(x=PC1, y=PC2,
label= labels,
color = color) +
geom_point(size=4) +
scale_color_viridis_d(option = "C") +
# scale_color_manual(values = ciona_stages_palette) +
xlab(paste0("PC1 (",pc.per_both[1],"%",")")) +
ylab(paste0("PC2 (",pc.per_both[2],"%",")")) +
labs(color = "Users By labs")+
coord_fixed() +
labs(color = "hpf") +
theme_personalised +
theme(legend.position = c(0.5,0.1),
legend.direction = "horizontal")
pca.plot_bothPaperggplotly(pca.plot_bothPaper) cairo_pdf("frog_rna_both_pca.pdf", width=8.5, height=5.6)
print(pca.plot_bothPaper)
invisible(dev.off())1.3.5 Correlation
###############################
#### Heatmap correlation ts1
###############################
### Compute pairwise correlation values
stages_ts1= TPM_txi_ts1.df %>% pivot_longer(-1, names_to = "stage", values_to = "expression") %>% pull(stage) |> unique()
log2.ts1.df <-
TPM_txi_ts1.df %>% pivot_longer(-1, names_to = "stage", values_to = "expression") %>%
mutate(expression = log2(expression + 1)) %>%
mutate(stage = factor(stage,stages_ts1)) |>
pivot_wider(names_from = stage,values_from = expression)
###############################
#### with average biological replicates
###############################
stages_ts1_mean= coldata_ts_gene_evo %>% pull(short_stage) |> unique()
ts1_meanRep.df <- mean_by_repStage.fn(TPM_txi_ts1.df,stages_ts1_mean)
# TPM meanRep filt log2
log2.ts1_meanRep.df_meanRep.df <-
ts1_meanRep.df %>% pivot_longer(-1, names_to = "stage", values_to = "expression") %>%
mutate(expression = log2(expression + 1)) %>%
mutate(stage = factor(stage,stages_ts1_mean)) |>
pivot_wider(names_from = stage,values_from = expression)
### Compute pairwise correlation values and then order matrix
TPM_ts1_cor_spearman <- cor(log2.ts1.df[,-1],method = "spearman")
TPM_ts1_cor_spearman <- TPM_ts1_cor_spearman[stages_ts1,(stages_ts1)]
TPM_ts1_mean_cor_spearman <- cor(log2.ts1_meanRep.df_meanRep.df[,-1],method = "spearman")
TPM_ts1_mean_cor_spearman <- TPM_ts1_mean_cor_spearman[stages_ts1_mean,(stages_ts1_mean)]
heatmap_TPM_ts1_spearman <- heatmap_function(TPM_ts1_cor_spearman,"TPM ts1","Spearman")
heatmap_TPM_ts1_spearmanheatmap_TPM_ts1mean_spearman <- heatmap_function(TPM_ts1_mean_cor_spearman,"TPM","Spearman")
heatmap_TPM_ts1mean_spearman#__________________________________________________________________________
###############################
#### Heatmap correlation ts2
###############################
### Compute pairwise correlation values
stages_ts2= TPM_txi_ts2.df %>% pivot_longer(-1, names_to = "stage", values_to = "expression") %>% pull(stage) |> unique()
log2.ts2.df <-
TPM_txi_ts2.df %>% pivot_longer(-1, names_to = "stage", values_to = "expression") %>%
mutate(expression = log2(expression + 1)) %>%
mutate(stage = factor(stage,stages_ts2)) |>
pivot_wider(names_from = stage,values_from = expression)
###############################
#### with average biological replicates
###############################
stages_ts2_mean = coldata_ts_expande %>% pull(short_stage) |> unique()
ts2_meanRep.df <- mean_by_repStage.fn(TPM_txi_ts2.df,stages_ts2_mean)
# TPM meanRep filt log2
log2.ts2_meanRep.df_meanRep.df <-
ts2_meanRep.df %>% pivot_longer(-1, names_to = "stage", values_to = "expression") %>%
mutate(expression = log2(expression + 1)) %>%
mutate(stage = factor(stage,stages_ts2_mean)) |>
pivot_wider(names_from = stage,values_from = expression)
### Compute pairwise correlation values and then order matrix
TPM_ts2_cor_spearman <- cor(log2.ts2.df[,-1],method = "spearman")
TPM_ts2_cor_spearman <- TPM_ts2_cor_spearman[stages_ts2,(stages_ts2)]
TPM_ts2_mean_cor_spearman <- cor(log2.ts2_meanRep.df_meanRep.df[,-1],method = "spearman")
TPM_ts2_mean_cor_spearman <- TPM_ts2_mean_cor_spearman[stages_ts2_mean,(stages_ts2_mean)]
heatmap_TPM_ts2_spearman <- heatmap_function(TPM_ts2_cor_spearman,"TPM ts2","Spearman")
heatmap_TPM_ts2_spearmanheatmap_TPM_ts2mean_spearman <- heatmap_function(TPM_ts2_mean_cor_spearman,"TPM","Spearman")
heatmap_TPM_ts2mean_spearman#__________________________________________________________________________
###############################
#### Heatmap correlation ts both
###############################
### Compute pairwise correlation values
stages_both= TPM_txi_both.df %>% pivot_longer(-1, names_to = "stage", values_to = "expression") %>% pull(stage) |> unique()
log2.both.df <-
TPM_txi_both.df %>% pivot_longer(-1, names_to = "stage", values_to = "expression") %>%
mutate(expression = log2(expression + 1)) %>%
mutate(stage = factor(stage,stages_both)) |>
pivot_wider(names_from = stage,values_from = expression)
###############################
#### with average biological replicates
###############################
stages_both_mean= coldata_ts_merged %>% pull(short_stage) |> unique()
# stages_both_mean= TPM_txi_both.df |> pivot_longer(-gene_id, names_to = "stage") %>%
# separate(stage, into = c("stage", "rep"), sep = "_") |> pull("stage")
both_meanRep.df <- mean_by_repStage.fn(TPM_txi_both.df,stages_both_mean)
# TPM meanRep filt log2
log2.both_meanRep.df_meanRep.df <-
both_meanRep.df %>% pivot_longer(-1, names_to = "stage", values_to = "expression") %>%
mutate(expression = log2(expression + 1)) %>%
mutate(stage = factor(stage,stages_both_mean)) |>
pivot_wider(names_from = stage,values_from = expression)
### Compute pairwise correlation values and then order matrix
TPM_both_cor_spearman <- cor(log2.both.df[,-1],method = "spearman")
TPM_both_cor_spearman <- TPM_both_cor_spearman[stages_both,(stages_both)]
TPM_both_mean_cor_spearman <- cor(log2.both_meanRep.df_meanRep.df[,-1],method = "spearman")
TPM_both_mean_cor_spearman <- TPM_both_mean_cor_spearman[stages_both_mean,(stages_both_mean)]
heatmap_TPM_both_spearman <- heatmap_function(TPM_both_cor_spearman,"TPM both","Spearman")
heatmap_TPM_both_spearmanheatmap_TPM_both_mean_spearman <- heatmap_function(TPM_both_mean_cor_spearman,"TPM","Spearman")
heatmap_TPM_both_mean_spearman#__________________________________________________________________________
cairo_pdf("rna_ts1Mean_spearman.pdf", width=8.5, height=8.5)
print(draw(heatmap_TPMts1_mean_spearman,
column_title = "Spearman",
column_title_gp = gpar(fontsize = 16),
heatmap_legend_side = "bottom",
annotation_legend_side = "bottom") )Error in h(simpleError(msg, call)): error in evaluating the argument 'object' in selecting a method for function 'draw': object 'heatmap_TPMts1_mean_spearman' not found
invisible(dev.off())
cairo_pdf("rna_ts2Mean_spearman.pdf", width=8.5, height=8.5)
print(draw(heatmap_TPMts2_mean_spearman,
column_title = "Spearman",
column_title_gp = gpar(fontsize = 16),
heatmap_legend_side = "bottom",
annotation_legend_side = "bottom") )Error in h(simpleError(msg, call)): error in evaluating the argument 'object' in selecting a method for function 'draw': object 'heatmap_TPMts2_mean_spearman' not found
invisible(dev.off())
cairo_pdf("rna_tsbothMean_spearman.pdf", width=8.5, height=8.5)
print(draw(heatmap_TPM_both_mean_spearman,
column_title = "Spearman",
column_title_gp = gpar(fontsize = 16),
heatmap_legend_side = "bottom",
annotation_legend_side = "bottom"))
invisible(dev.off())